#coding:utf-8

from urlparse import urljoin
from BeautifulSoup import *
import urllib2
import sys
from os.path import dirname,join
sys.path.append(join(dirname(__file__), '/home/yh/work/pymmseg-cpp'))
import mmseg

# Page whose text gets segmented.
# Alternative test page: http://fanqiang.chinaunix.net/2007-03-16/5127.shtml
page = "http://fanqiang.chinaunix.net/system/linux/2007-03-15/5110.shtml"

# Tokens that are never reported.  Currently empty; candidates that were
# tried and then disabled:
#   '，','。','？','；','★','nbsp','：','｀','！','＠','、','?'
ignorewords = set()

# Fetch the page up front; nothing else in the script can run without it.
try:
    c = urllib2.urlopen(page)
except Exception:
    # Top-level boundary: any fetch failure ends the script, but unlike a
    # bare ``except:`` this lets KeyboardInterrupt / SystemExit propagate.
    # The diagnostic goes to stderr so stdout carries only the word list.
    sys.stderr.write("ERROR:Can't open url\n")
    # sys.exit() rather than exit(): the latter is injected by the ``site``
    # module and is not guaranteed to exist (e.g. under ``python -S``).
    sys.exit(1)

def gettextonly(soup):
    """Return the text found under *soup* with the markup left out.

    If ``soup.string`` is not None, that string is returned with leading
    and trailing whitespace stripped.  Otherwise every entry of
    ``soup.contents`` is processed recursively and the pieces are
    concatenated, each one followed by a newline.  A node without a
    string and with empty ``contents`` yields ``''``.
    """
    own_text = soup.string
    if own_text is not None:
        return own_text.strip()
    # One join instead of repeated ``+=`` on a growing string.
    return ''.join(gettextonly(child) + '\n' for child in soup.contents)

soup=BeautifulSoup(c.read())
text=gettextonly(soup)

mmseg.dict_load_defaults()
for word in text.split():
    #print word
    if isinstance(word,unicode):
        algor=mmseg.Algorithm(word.encode("utf-8"))
    for s in algor:
        word=s.text
        if word not in ignorewords:
            if not word.isalnum():
                if len(word)%3==0 and len(word)>3:
                    print word," ",len(word)





